Python for Bioinformatics

This Jupyter notebook is intended to be used alongside the book Python for Bioinformatics

Chapter 3: Basic Programming: Data Types

STRINGS


In [2]:
"This is a string in Python"
'This is a string in Python'
'''This is a string in Python'''
"""This is a string in Python"""


Out[2]:
'This is a string in Python'

In [4]:
"A single quote (’) inside a double quote"
'Here we have "double quotes" inside single quotes'


Out[4]:
'Here we have "double quotes" inside single quotes'

In [5]:
"Mixing quotes leads to the dark side'


  File "<ipython-input-5-392baaada2f8>", line 1
    "Mixing quotes leads to the dark side'
                                          ^
SyntaxError: EOL while scanning string literal

In [6]:
"""Hi! I'm a
multiline
          string"""


Out[6]:
"Hi! I'm a\nmultiline\n          string"

In [7]:
"Hi! I'm a\nmultiline\n          string"


Out[7]:
"Hi! I'm a\nmultiline\n          string"

Strings are sequences of Unicode characters


In [4]:
'In Python 3, strings are Unicode: こんにちは 世界'


Out[4]:
'In Python 3, strings are Unicode: こんにちは 世界'

String Manipulation


In [4]:
signal_peptide = 'MASKATLLLAFTLLFATCIA'

In [5]:
signal_peptide.lower()


Out[5]:
'maskatlllaftllfatcia'

In [6]:
signal_peptide


Out[6]:
'MASKATLLLAFTLLFATCIA'

In [7]:
# lower() returns a new string and leaves the original untouched, so the
# name has to be rebound in order to keep the lowercase version.
signal_peptide = signal_peptide.lower()
signal_peptide


Out[7]:
'maskatlllaftllfatcia'

In [8]:
dna_seq = 'GCTAGTAATGTG'
# replace() builds a new string with every 'T' swapped for 'U';
# dna_seq itself is not modified by the call.
m_rna_seq = dna_seq.replace('T','U')
m_rna_seq


Out[8]:
'GCUAGUAAUGUG'

In [9]:
dna_seq


Out[9]:
'GCTAGTAATGTG'

In [10]:
# GC content: share of C and G bases in dna_seq, expressed as a percentage.
# count() is case-sensitive, so only upper-case 'C' and 'G' are counted here.
c = dna_seq.count("C")
g = dna_seq.count("G")
(c+g)/len(dna_seq)*100


Out[10]:
41.66666666666667

In [11]:
m_rna_seq


Out[11]:
'GCUAGUAAUGUG'

In [12]:
m_rna_seq.find('AUG')


Out[12]:
7

In [13]:
m_rna_seq.find('GGG')


Out[13]:
-1

In [14]:
'This string has words separated by spaces'.split()


Out[14]:
['This', 'string', 'has', 'words', 'separated', 'by', 'spaces']

In [15]:
"Alex Doe,5555-2333,nobody@example.com".split()


Out[15]:
['Alex', 'Doe,5555-2333,nobody@example.com']

In [16]:
"Alex Doe,5555-2333,nobody@example.com".split(",")


Out[16]:
['Alex Doe', '5555-2333', 'nobody@example.com']

In [17]:
''.join(['A','C','A','T'])


Out[17]:
'ACAT'

Lists

List Is a Basic Datatype in Python


In [18]:
'Alex Doe,5555-2333,hi@example.com'.split(',')


Out[18]:
['Alex Doe', '5555-2333', 'hi@example.com']

In [19]:
first_list = [1, 2, 3, 4, 5]

In [20]:
other_list = [1, 'two', 3, 4, 'last']

In [21]:
nested_list = [1, 'two', first_list, 4, 'last']
nested_list


Out[21]:
[1, 'two', [1, 2, 3, 4, 5], 4, 'last']

In [22]:
empty_list = []
empty_list


Out[22]:
[]

In [23]:
first_list = [1, 2, 3, 4, 5]
first_list[0]


Out[23]:
1

In [24]:
first_list[1]


Out[24]:
2

In [25]:
first_list = [1, 2, 3, 4, 5]
first_list[-1]


Out[25]:
5

In [26]:
first_list[-4]


Out[26]:
2

In [27]:
aseq = "atggctaggc"
list(aseq)


Out[27]:
['a', 't', 'g', 'g', 'c', 't', 'a', 'g', 'g', 'c']

In [28]:
samples = ['red'] * 5
samples


Out[28]:
['red', 'red', 'red', 'red', 'red']

In [29]:
samples = [None] * 5
samples


Out[29]:
[None, None, None, None, None]

In [30]:
a = [0, 1, 2, 3, 4, 5]

In [31]:
[3*x for x in a]


Out[31]:
[0, 3, 6, 9, 12, 15]

In [32]:
animals = ['  King Kong', '  Godzilla ', 'Gamera  ']
[x.strip() for x in animals]


Out[32]:
['King Kong', 'Godzilla', 'Gamera']

In [33]:
animals = ['  King Kong', '  Godzilla ', 'Gamera  ']
# The trailing "if" filters before the expression is applied: only names
# containing a lower-case 'i' are stripped and kept, so 'Gamera' is dropped.
[x.strip() for x in animals if 'i' in x]


Out[33]:
['King Kong', 'Godzilla']

Modifying Lists


In [34]:
first_list.append(99)
first_list


Out[34]:
[1, 2, 3, 4, 5, 99]

In [35]:
first_list.insert(2,50)
first_list


Out[35]:
[1, 2, 50, 3, 4, 5, 99]

In [36]:
first_list.extend([6,7,8])
first_list


Out[36]:
[1, 2, 50, 3, 4, 5, 99, 6, 7, 8]

In [37]:
[1,2,3]+[4,5]


Out[37]:
[1, 2, 3, 4, 5]

In [38]:
first_list


Out[38]:
[1, 2, 50, 3, 4, 5, 99, 6, 7, 8]

In [39]:
first_list.pop()


Out[39]:
8

In [40]:
first_list.pop(2)


Out[40]:
50

In [41]:
first_list


Out[41]:
[1, 2, 3, 4, 5, 99, 6, 7]

In [42]:
first_list.remove(99)
first_list


Out[42]:
[1, 2, 3, 4, 5, 6, 7]

In [43]:
first_list


Out[43]:
[1, 2, 3, 4, 5, 6, 7]

In [44]:
first_list.remove(10)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-44-e4a905a58ced> in <module>()
----> 1 first_list.remove(10)

ValueError: list.remove(x): x not in list

In [45]:
a = [1, 2, 3]
# Plain assignment does not copy: b is a second name for the same list
# object, so popping through b also shortens a.
b = a
b.pop()


Out[45]:
3

In [46]:
a


Out[46]:
[1, 2]

In [47]:
import copy
a = [1, 2, 3]
# copy.copy() creates a new (shallow) list with the same items, so the pop
# performed through b does not affect a.
b = copy.copy(a)
b.pop()


Out[47]:
3

In [48]:
a


Out[48]:
[1, 2, 3]

In [49]:
a = [1, 2, 3]
# A full slice a[:] produces a new list holding the same items, so popping
# from b leaves a intact.
b = a[:]
b.pop()


Out[49]:
3

In [50]:
a


Out[50]:
[1, 2, 3]

In [51]:
point = (23, 56, 11)

In [52]:
point.append(3)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-52-690c246d9903> in <module>()
----> 1 point.append(3)

AttributeError: 'tuple' object has no attribute 'append'

In [53]:
point.pop()


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-53-f0dcbfa25f33> in <module>()
----> 1 point.pop()

AttributeError: 'tuple' object has no attribute 'pop'

Common Properties of the Sequences


In [5]:
point = (23, 56, 11)
point[0]


Out[5]:
23

In [55]:
point[1]


Out[55]:
56

In [8]:
my_sequence = 'MRVLLVALALLALAASATS'
my_sequence[0]


Out[8]:
'M'

In [65]:
my_sequence[5]


Out[65]:
'V'

In [2]:
parameters = ['UniGene', 'dna', 'Mm.248907', 5]
parameters[2]


Out[2]:
'Mm.248907'

In [6]:
point[-1]


Out[6]:
11

In [61]:
point[-2]


Out[61]:
56

In [66]:
my_sequence[-2]


Out[66]:
'T'

In [67]:
my_sequence[-4]


Out[67]:
'S'

In [9]:
my_sequence[-1]


Out[9]:
'S'

In [69]:
seqdata = ('MRVLLVALALLA', 12, '5FE9EEE8EE2DC2C7')
seqdata[0][5]


Out[69]:
'V'

In [70]:
my_sequence="Python"
my_sequence[0:2]


Out[70]:
'Py'

In [71]:
my_sequence[:2]


Out[71]:
'Py'

In [72]:
my_sequence="Python"
my_sequence[4:6]


Out[72]:
'on'

In [73]:
my_sequence[4:]


Out[73]:
'on'

In [74]:
my_sequence[1:5]


Out[74]:
'ytho'

In [75]:
my_sequence[1:5:2]


Out[75]:
'yh'

In [76]:
my_sequence[::-1]


Out[76]:
'nohtyP'

In [77]:
point = (23, 56, 11)
11 in point


Out[77]:
True

In [78]:
my_sequence = 'MRVLLVALALLALAASATS'
'X' in my_sequence


Out[78]:
False

In [79]:
point = (23, 56, 11)
point2 = (2, 6, 7)
point + point2


Out[79]:
(23, 56, 11, 2, 6, 7)

In [80]:
dna_seq = 'ATGCTAGACGTCCTCAGATAGCCG'
tata_box = 'TATAAA'
tata_box + dna_seq


Out[80]:
'TATAAAATGCTAGACGTCCTCAGATAGCCG'

In [81]:
point + tata_box


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-81-5fab3f8bc73f> in <module>()
----> 1 point + tata_box

TypeError: can only concatenate tuple (not "str") to tuple

In [82]:
point = (23, 56, 11)
len(point)


Out[82]:
3

In [83]:
my_sequence = 'MRVLLVALALLALAASATS'
len(my_sequence)


Out[83]:
19

In [84]:
point


Out[84]:
(23, 56, 11)

In [85]:
max(point)


Out[85]:
56

In [86]:
min(point)


Out[86]:
11

In [87]:
my_sequence = 'MRVLLVALALLALAASATS'
max(my_sequence)


Out[87]:
'V'

In [88]:
min(my_sequence)


Out[88]:
'A'

In [89]:
tata_box = 'TATAAA'
list(tata_box)


Out[89]:
['T', 'A', 'T', 'A', 'A', 'A']

Dictionaries


In [90]:
iupac = {'A':'Ala','C':'Cys','E':'Glu'}
print('C stands for the amino acid {0}'.format(iupac['C']))


C stands for the amino acid Cys

In [91]:
iupac['E']


Out[91]:
'Glu'

In [5]:
# dict() accepts a sequence of (key, value) pairs and builds the mapping
# from them: each tuple's first item becomes a key, the second its value.
rgb = [('red','ff0000'), ('green','00ff00'), ('blue','0000ff')]
colors_d = dict(rgb)
colors_d


Out[5]:
{'blue': '0000ff', 'green': '00ff00', 'red': 'ff0000'}

In [93]:
rgb = dict(red='ff0000', green='00ff00', blue='0000ff')
rgb


Out[93]:
{'blue': '0000ff', 'green': '00ff00', 'red': 'ff0000'}

In [94]:
rgb = {}
rgb['red'] = 'ff0000'
rgb['green'] = '00ff00'
rgb


Out[94]:
{'green': '00ff00', 'red': 'ff0000'}

In [95]:
len(iupac)


Out[95]:
3

In [96]:
iupac['S'] = 'Ser'
len(iupac)


Out[96]:
4

In [97]:
iupac = {'A':'Ala','C':'Cys','E':'Glu'}
iupac


Out[97]:
{'A': 'Ala', 'C': 'Cys', 'E': 'Glu'}

In [98]:
iupac['X'] = 'Xaa'
iupac


Out[98]:
{'A': 'Ala', 'C': 'Cys', 'E': 'Glu', 'X': 'Xaa'}

In [99]:
from collections import OrderedDict
# OrderedDict remembers the order in which keys were inserted; the echoed
# repr lists the pairs in exactly that order.
d = OrderedDict()
d['a'] = 'A'
d['b'] = 'B'
d['c'] = 'C'
d


Out[99]:
OrderedDict([('a', 'A'), ('b', 'B'), ('c', 'C')])

In [100]:
iupac


Out[100]:
{'A': 'Ala', 'C': 'Cys', 'E': 'Glu', 'X': 'Xaa'}

In [101]:
iupac.keys()


Out[101]:
dict_keys(['A', 'E', 'X', 'C'])

In [102]:
iupac.values()


Out[102]:
dict_values(['Ala', 'Glu', 'Xaa', 'Cys'])

In [103]:
iupac.values()


Out[103]:
dict_values(['Ala', 'Glu', 'Xaa', 'Cys'])

In [104]:
iupac.keys()


Out[104]:
dict_keys(['A', 'E', 'X', 'C'])

In [105]:
# keys() and values() return view objects rather than snapshots: they keep
# tracking the dictionary, so the entry popped below vanishes from both views.
iupac_keys = iupac.keys()
iupac_vals = iupac.values()
iupac.pop('X')


Out[105]:
'Xaa'

In [106]:
iupac_keys


Out[106]:
dict_keys(['A', 'E', 'C'])

In [107]:
iupac_vals


Out[107]:
dict_values(['Ala', 'Glu', 'Cys'])

In [15]:
iupac = {'E': 'Glu', 'X': 'Xaa', 'C': 'Cys', 'A': 'Ala'}
iupac.items()


Out[15]:
dict_items([('E', 'Glu'), ('A', 'Ala'), ('X', 'Xaa'), ('C', 'Cys')])

In [109]:
iupac = {'E': 'Glu', 'X': 'Xaa', 'C': 'Cys', 'A': 'Ala'}
iupac.get('A','No translation available')


Out[109]:
'Ala'

In [110]:
iupac.get('Z','No translation available')


Out[110]:
'No translation available'

In [17]:
iupac.get('Z')

In [114]:
iupac = {'E': 'Glu', 'X': 'Xaa', 'C': 'Cys', 'A': 'Ala'}
del iupac['A']
iupac


Out[114]:
{'C': 'Cys', 'E': 'Glu', 'X': 'Xaa'}

In [117]:
first_set = {'CP0140.1','XJ8113.5','EF3616.3'}

In [118]:
first_set = set()
first_set.add('CP0140.1')
first_set.add('XJ8113.5')
first_set.add('EF3616.3')
first_set


Out[118]:
{'CP0140.1', 'EF3616.3', 'XJ8113.5'}

In [119]:
{2*x for x in [1,2,3]}


Out[119]:
{2, 4, 6}

In [120]:
first_set.add('CP0140.1')
first_set


Out[120]:
{'CP0140.1', 'EF3616.3', 'XJ8113.5'}

In [121]:
{2*x for x in [1,1,2,2,3,3]}


Out[121]:
{2, 4, 6}

In [122]:
uniques = {2,2,3,4,5,3}
uniques


Out[122]:
{2, 3, 4, 5}

In [123]:
first_set = {'CP0140.1','XJ8113.5','EF3616.3'}
other_set = {'EF3616.3'}
# intersection() returns a new set holding only the elements present in
# both sets; neither operand is modified.
common = first_set.intersection(other_set)
common


Out[123]:
{'EF3616.3'}

In [124]:
common = first_set & other_set
common


Out[124]:
{'EF3616.3'}

In [125]:
first_set = {'CP0140.1','XJ8113.5','EF3616.3'}
other_set = {'AB7416.2'}
first_set.union(other_set)


Out[125]:
{'AB7416.2', 'CP0140.1', 'EF3616.3', 'XJ8113.5'}

In [126]:
first_set | other_set


Out[126]:
{'AB7416.2', 'CP0140.1', 'EF3616.3', 'XJ8113.5'}

In [127]:
first_set.difference(other_set)


Out[127]:
{'CP0140.1', 'EF3616.3', 'XJ8113.5'}

In [128]:
first_set - other_set


Out[128]:
{'CP0140.1', 'EF3616.3', 'XJ8113.5'}

In [129]:
other_set - first_set


Out[129]:
{'AB7416.2'}

In [130]:
first_set.symmetric_difference(other_set)


Out[130]:
{'AB7416.2', 'CP0140.1', 'EF3616.3', 'XJ8113.5'}

In [132]:
first_set ^ other_set


Out[132]:
{'AB7416.2', 'CP0140.1', 'EF3616.3', 'XJ8113.5'}

In [133]:
first_set


Out[133]:
{'CP0140.1', 'EF3616.3', 'XJ8113.5'}

In [134]:
list(first_set)


Out[134]:
['XJ8113.5', 'CP0140.1', 'EF3616.3']

In [135]:
fs = frozenset(['a','b'])
fs


Out[135]:
frozenset({'a', 'b'})

In [136]:
fs.remove('a')


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-136-539d6be46eaf> in <module>()
----> 1 fs.remove('a')

AttributeError: 'frozenset' object has no attribute 'remove'

In [137]:
fs.add('c')


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-137-76462344c7ad> in <module>()
----> 1 fs.add('c')

AttributeError: 'frozenset' object has no attribute 'add'

Naming Objects


In [1]:
23crm = "1"    # Start with a number
23 = "1"       # Start with a number
Var? = "value" # Has an invalid character (?).
$five = 5      # Has an invalid character ($)
for = 123      # Has a reserved word
if = "data"    # Has a reserved word


  File "<ipython-input-1-967416013cf3>", line 1
    23crm = "1"    # Start with a number
        ^
SyntaxError: invalid syntax

In [142]:
# Rebuild one sample object per container type used in this chapter.
my_sequence = 'MRVLLVALALLALAASATS'                 # str
first_list = [1, 2, 3, 4, 5]                        # list
d = {1: 'a', 2: 'b', 3: 'c'}                        # dict
k = d.keys()                                        # keys view bound to d
point = (23, 56, 11)                                # tuple
first_set = {'CP0140.1', 'XJ8113.5', 'EF3616.3'}    # set
fs = frozenset(['a', 'b'])                          # frozenset

In [10]:
a = 3
# The list stores the object a currently refers to (the int 3), not the
# name a itself; rebinding a later does not change b.
b = [1,2,a]

In [11]:
b


Out[11]:
[1, 2, 3]

In [145]:
a = 5
b


Out[145]:
[1, 2, 3]

In [146]:
c = [1, 2, 3]
# d holds a reference to the list c, not a copy of its contents, so
# in-place changes to c are visible through d.
d = [5, 6, c]

In [147]:
c


Out[147]:
[1, 2, 3]

In [148]:
d


Out[148]:
[5, 6, [1, 2, 3]]

In [149]:
c.pop()


Out[149]:
3

In [150]:
c


Out[150]:
[1, 2]

In [151]:
d


Out[151]:
[5, 6, [1, 2]]

In [152]:
a = 3
b = [1, 2, a]

In [153]:
b


Out[153]:
[1, 2, 3]

In [154]:
a = 5
b


Out[154]:
[1, 2, 3]

In [13]:
c = [1, 2, 3]
d = [5, 6, c]

In [156]:
c


Out[156]:
[1, 2, 3]

In [157]:
d


Out[157]:
[5, 6, [1, 2, 3]]

In [158]:
c.pop()


Out[158]:
3

In [160]:
print(c)


[1, 2]

In [14]:
print(d)


[5, 6, [1, 2]]